FictionEro - Data Cleaning

Data Preparation

Code
library(tidyverse)
library(easystats)
library(patchwork)
library(ggside)
df <- read.csv("../data/rawdata_participants.csv") |> 
  mutate(across(everything(), ~ifelse(.x == "", NA, .x))) |>
  mutate(Experimenter = case_when(
    Language=="English" & Experimenter == "reddit7" ~ "Reddit (other)",
    Language=="English" & Experimenter == "reddit8" ~ "Reddit (other)",
    Language=="English" & Experimenter == "reddit1" ~ "Reddit (other)",
    .default = Experimenter
  ))

dftask <- read.csv("../data/rawdata_task.csv") |> 
  full_join(
    df[c("Participant", "Sex", "SexualOrientation")],
    by = join_by(Participant)
    )

The initial sample consisted of 409 participants (Mean age = 35.7, SD = 12.4, range: [18, 80]; Sex: 18.3% females, 80.4% males, 1.2% other; Education: Bachelor, 38.88%; Doctorate, 7.33%; High School, 23.47%; Master, 28.12%; Other, 1.47%; Primary School, 0.73%; Country: 28.61% USA, 13.45% France, 13.20% UK, 44.74% other).

Compute Scores

# Create Sexual "relevance" variable (Relevant, irrelevant, non-erotic)
dftask <- dftask |> 
  mutate(Relevance = case_when(
    Type == "Non-erotic" ~ "Non-erotic",
    Sex == "Male" & SexualOrientation == "Heterosexual" & Category == "Female" ~ "Relevant",
    Sex == "Female" & SexualOrientation == "Heterosexual" & Category == "Male" ~ "Relevant",
    Sex == "Male" & SexualOrientation == "Homosexual" & Category == "Male" ~ "Relevant",
    Sex == "Female" & SexualOrientation == "Homosexual" & Category == "Female" ~ "Relevant",
    # TODO: what to do with "Other"? 
    SexualOrientation %in% c("Bisexual", "Other") & Category %in% c("Male", "Female") ~ "Relevant",
    .default = "Irrelevant"
  )) 

Recruitment History

Code
# Consecutive count of participants per day (as area)
df |>
  mutate(Date = as.Date(Date, format = "%d/%m/%Y")) |> 
  group_by(Date, Language, Experimenter) |> 
  summarize(N = n()) |> 
  ungroup() |>
  # https://bocoup.com/blog/padding-time-series-with-r
  complete(Date, Language, Experimenter, fill = list(N = 0)) |> 
  group_by(Language, Experimenter) |>
  mutate(N = cumsum(N)) |>
  ggplot(aes(x = Date, y = N)) +
  geom_area(aes(fill=Experimenter)) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(
    title = "Recruitment History",
    x = "Date",
    y = "Total Number of Participants"
  ) +
  facet_wrap(~Language, nrow=3) +
  see::theme_modern() 

Code
# Table
summarize(df, N = n(), .by=c("Language", "Experimenter")) |> 
  arrange(desc(N)) |> 
  gt::gt() |> 
  gt::opt_stylize() |> 
  gt::opt_interactive(use_compact_mode = TRUE) |> 
  gt::tab_header("Number of participants per recruitment source")
Number of participants per recruitment source

Feedback

Evaluation

The majority of participants found it to be a “fun” experience. It is interesting to note that reports of “fun” were significantly associated with finding (some) stimuli arousing. Conversely, reporting “no feelings” was associated with finding the experiment “boring”.

Code
df |> 
  select(starts_with("Feedback"), -Feedback_Comments) |>
  pivot_longer(everything(), names_to = "Question", values_to = "Answer") |>
  group_by(Question, Answer) |> 
  summarise(prop = n()/nrow(df), .groups = 'drop') |> 
  complete(Question, Answer, fill = list(prop = 0)) |> 
  filter(Answer == "True") |> 
  mutate(Question = str_remove(Question, "Feedback_"),
         Question = str_replace(Question, "AILessArousing", "AI = Less arousing"),
         Question = str_replace(Question, "AIMoreArousing", "AI = More arousing"),
         Question = str_replace(Question, "CouldNotDiscriminate", "Hard to discriminate"),
         Question = str_replace(Question, "LabelsIncorrect", "Labels were incorrect"),
         Question = str_replace(Question, "NoFeels", "Didn't feel anything"),
         Question = str_replace(Question, "CouldDiscriminate", "Easy to discriminate"),
         Question = str_replace(Question, "LabelsReversed", "Labels were reversed")) |>
  mutate(Question = fct_reorder(Question, desc(prop))) |> 
  ggplot(aes(x = Question, y = prop)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(expand = c(0, 0), breaks= scales::pretty_breaks(), labels=scales::percent) +
  labs(x="Feedback", y = "Participants", title = "Feedback") +
  theme_modern(axis.title.space = 15) +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2), vjust = 7),
    axis.text.y = element_text(size = rel(1.1)),
    axis.text.x = element_text(size = rel(1.1), angle = 45, hjust = 1),
    axis.title.x = element_blank()
  )

Code
cor <- df |> 
  select(starts_with("Feedback"), -Feedback_Comments) |> 
  mutate_all(~ifelse(.=="True", 1, 0)) |> 
  correlation(method="tetrachoric", redundant = TRUE) |> 
  correlation::cor_sort() |> 
  correlation::cor_lower()
For i = 2 j = 1  A cell entry of 0 was replaced with correct =  0.5.  Check your data!
For i = 2 j = 1  A cell entry of 0 was replaced with correct =  0.5.  Check your data!
Code
cor |> 
  mutate(val = paste0(insight::format_value(rho), format_p(p, stars_only=TRUE))) |>
  mutate(Parameter2 = fct_rev(Parameter2)) |>
  mutate(Parameter1 = fct_relabel(Parameter1, \(x) str_remove_all(x, "Feedback_")),
         Parameter2 = fct_relabel(Parameter2, \(x) str_remove_all(x, "Feedback_"))) |>
  ggplot(aes(x=Parameter1, y=Parameter2)) +
  geom_tile(aes(fill = rho), color = "white") +
  geom_text(aes(label = val), size = 3) +
  labs(title = "Feedback Co-occurence Matrix") +
  scale_fill_gradient2(
    low = "#2196F3",
    mid = "white",
    high = "#F44336",
    breaks = c(-1, 0, 1),
    guide = guide_colourbar(ticks=FALSE),
    midpoint = 0,
    na.value = "grey85",
    limit = c(-1, 1))  + 
  theme_minimal() +
  theme(legend.title = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1))

Comments

Code
data.frame(Language = df$Language,
           Source = df$Experimenter,
           Comments = trimws(df$Feedback_Comments)) |> 
  filter(!Comments %in% c(NA, "No", "Nope", "nope", "None", "na", "n/a", "Non")) |> 
  arrange(Language, Source) |>
  gt::gt() |> 
  gt::opt_stylize() |> 
  gt::opt_interactive(use_compact_mode = TRUE) 

Exclusion

outliers <- c(
  # "399"  # Negative Arousal-Valence correlations
  )
potentials <- list()

Mobile

Code
df |>
  ggplot(aes(x=Mobile, fill=Mobile)) +
  geom_bar() +
  geom_hline(yintercept=0.5*nrow(df), linetype="dashed") +
  theme_modern()

We removed 145 participants that participated with a mobile device.

Code
df <- filter(df, Mobile == "False")
dftask <- filter(dftask, Participant %in% df$Participant)

Experiment Duration

The experiment’s median duration is 23.90 min (50% HDI [18.39, 24.96]).

Code
df |>
  mutate(Participant = fct_reorder(Participant, Experiment_Duration),
         Category = ifelse(Experiment_Duration > 60, "extra", "ok"),
         Duration = ifelse(Experiment_Duration > 60, 60, Experiment_Duration),
         Group = ifelse(Participant %in% outliers, "Outlier", "ok")) |>
  ggplot(aes(y = Participant, x = Duration)) +
  geom_point(aes(color = Group, shape = Category)) +
  geom_vline(xintercept = median(df$Experiment_Duration), color = "red", linetype = "dashed") +
  scale_shape_manual(values = c("extra" = 3, ok = 19)) +
  scale_color_manual(values = c("Outlier" = "red", ok = "black"), guide="none") +
  guides(color = "none", shape = "none") +
  ggside::geom_xsidedensity(fill = "#4CAF50", color=NA) +
  ggside::scale_xsidey_continuous(expand = c(0, 0)) +
  labs(
    title = "Experiment Completion Time",
    x = "Duration (in minutes)",
    y = "Participant"
  )  +
  theme_bw() +
  ggside::theme_ggside_void() +
  theme(ggside.panel.scale = .3,
        panel.border = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank())

Code
potentials$expe_duration <- arrange(df, Experiment_Duration) |>
  select(Participant, Experiment_Duration) |>
  head(5) 

Task Duration

Code
plot_hist <- function(dat) {
  dens <- rbind(
    mutate(bayestestR::estimate_density(filter(dftask, RT1 < 40 & RT2 < 40)$RT1), 
           Phase="Emotional ratings",
           y = y / max(y)),
    mutate(bayestestR::estimate_density(filter(dftask, RT1 < 40 & RT2 < 40)$RT2), 
           Phase="Reality rating",
           y = y / max(y))
  )
  
  dat |> 
    filter(RT1 < 40 & RT2 < 40) |>  # Remove very long RTs
    # mutate(Participant = fct_reorder(Participant, RT1)) |> 
    pivot_longer(cols = c(RT1, RT2), names_to = "Phase", values_to = "RT") |>
    mutate(Phase = ifelse(Phase == "RT1", "Emotional ratings", "Reality rating")) |>
    ggplot(aes(x=RT)) +
    geom_area(data=dens, aes(x=x, y=y, fill=Phase), alpha=0.33, position="identity") +
    geom_density(aes(color=Phase, y=after_stat(scaled)), linewidth=1.5) + 
    scale_x_sqrt(breaks=c(0, 2, 5, 10, 20)) +
    theme_minimal() +
    theme(axis.title.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.text.y = element_blank(),
          axis.line.y = element_blank()) +
    labs(title = "Distribution of Response Time for each Participant", x="Response time per stimuli (s)") +
    facet_wrap(~Participant, ncol=5, scales="free_y") +
    coord_cartesian(xlim = c(0, 25))
}
Code
plot_hist(dftask[dftask$Participant %in% df$Participant[1:60], ])

Code
plot_hist(dftask[dftask$Participant %in% df$Participant[61:120], ])

Code
plot_hist(dftask[dftask$Participant %in% df$Participant[121:180], ])

Code
plot_hist(dftask[dftask$Participant %in% df$Participant[181:240], ])

Code
plot_hist(dftask[dftask$Participant %in% df$Participant[241:264], ])

BAIT Questionnaire Duration

Code
df |>
  mutate(Participant = fct_reorder(Participant, BAIT_Duration),
         Category = ifelse(BAIT_Duration > 5, "extra", "ok"),
         Duration = ifelse(BAIT_Duration > 5, 5, BAIT_Duration),
         Group = ifelse(Participant %in% outliers, "Outlier", "ok")) |>
  ggplot(aes(y = Participant, x = Duration)) +
  geom_point(aes(color = Group, shape = Category)) +
  geom_vline(xintercept = median(df$BAIT_Duration), color = "red", linetype = "dashed") +
  scale_shape_manual(values = c("extra" = 3, ok = 19)) +
  scale_color_manual(values = c("Outlier" = "red", ok = "black"), guide="none") +
  guides(color = "none", shape = "none") +
  ggside::geom_xsidedensity(fill = "#9C27B0", color=NA) +
  ggside::scale_xsidey_continuous(expand = c(0, 0)) +
  labs(
    title = "Questionnaire Completion Time",
    x = "Duration (in minutes)",
    y = "Participant"
  )  +
  theme_bw() +
  ggside::theme_ggside_void() +
  theme(ggside.panel.scale = .3,
        panel.border = element_blank(),
        axis.ticks.y = element_blank(),
          axis.text.y = element_blank()) 

Response to Erotic Stimuli

Code
dat <- dftask |> 
  filter(Relevance %in% c("Relevant", "Non-erotic")) |> 
  group_by(Participant, Type) |> 
  summarise(Arousal = mean(Arousal), 
            Valence = mean(Valence),
            Enticement = mean(Enticement),
            .groups = "drop") |>
  pivot_wider(names_from = Type, values_from = all_of(c("Arousal", "Valence", "Enticement"))) |>
  transmute(Participant = Participant,
            Arousal = Arousal_Erotic - `Arousal_Non-erotic`,
            Valence = Valence_Erotic - `Valence_Non-erotic`,
            Enticement = Enticement_Erotic - `Enticement_Non-erotic`) |>
  filter(!is.na(Arousal)) |> 
  mutate(Participant = fct_reorder(Participant, Arousal)) 

dat |> 
  pivot_longer(-Participant) |> 
  mutate(Group = ifelse(Participant %in% outliers, "Outlier", "ok")) |> 
  ggplot(aes(x=value, y=Participant, fill=Group)) +
  geom_bar(aes(fill=value), stat = "identity") +
  scale_fill_gradient2(low = "#3F51B5", mid = "#FF9800", high = "#4CAF50", midpoint = 0) +
  # scale_fill_manual(values = c("Outlier" = "red", ok = "black"), guide="none") +
  theme_bw() +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank()) +
  labs(title = "Difference between Erotic and Neutral", x="Erotic - Neutral") +
  facet_wrap(~name, ncol=3, scales="free_x")

Code
potentials$emo_diff <- arrange(dat, Arousal) |>
  head(5)

Response Coherence

Code
dat <- dftask |> 
  summarize(cor_ArVal = cor(Arousal, Valence),
            cor_ArEnt = cor(Arousal, Enticement),
            .by="Participant") |>
  mutate(Participant = fct_reorder(Participant, cor_ArVal)) 

dat |>
  pivot_longer(-Participant) |> 
  mutate(Group = ifelse(Participant %in% outliers, "Outlier", "ok")) |> 
  mutate(name = fct_relevel(name, "cor_ArVal", "cor_ArEnt"),
         name = fct_recode(name, "Arousal - Valence" = "cor_ArVal", "Arousal - Enticement" = "cor_ArEnt")) |>
  ggplot(aes(y = Participant, x = value, fill = Group)) +
  geom_bar(stat = "identity") +
  # scale_fill_gradient2(low = "#3F51B5", mid = "#FF9800", high = "#4CAF50", midpoint = 0) +
  scale_fill_manual(values = c("Outlier" = "red", ok = "black"), guide="none") +
  theme_bw() +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank()) +
  labs(title = "Difference between Erotic and Neutral", x="Erotic - Neutral") +
  facet_wrap(~name, ncol=3, scales="free_x")

Code
potentials$emo_cor <- arrange(dat, cor_ArVal) |>
  head(5)
Code
c(as.character(potentials$expe_duration$Participant), 
  as.character(potentials$emo_diff$Participant), 
  as.character(potentials$emo_cor$Participant)) |> 
  table()

S019 S137 S168 S186 S259 S303 S305 S338 S341 S381 S391 S399 S404 
   1    1    1    1    1    1    1    1    2    1    1    2    1 

Sexual Profile

Sample

Code
df |>
  ggplot(aes(x = Sex)) +
  geom_bar(aes(fill = SexualOrientation)) +
  scale_y_continuous(expand = c(0, 0), breaks = scales::pretty_breaks()) +
  scale_fill_metro_d() +
  labs(x = "Biological Sex", y = "Number of Participants", title = "Sex and Sexual Orientation", fill = "Sexual Orientation") +
  theme_modern(axis.title.space = 15) +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2), vjust = 7),
    axis.text.y = element_text(size = rel(1.1)),
    axis.text.x = element_text(size = rel(1.1)),
    axis.title.x = element_blank()
  )

We removed 10 participants that were incompatible with further analysis.

df <- filter(df, Sex != "Other" & SexualOrientation != "Other")
dftask <- filter(dftask, Participant %in% df$Participant)

Task Behaviour

Code
show_distribution <- function(dftask, target="Arousal") {
  dftask |> 
    filter(SexualOrientation %in% c("Heterosexual", "Bisexual", "Homosexual")) |>
    bayestestR::estimate_density(select=target, 
                                 at=c("Relevance", "Category", "Sex", "SexualOrientation"), 
                                 method="KernSmooth") |>
    ggplot(aes(x = x, y = y)) +
    geom_line(aes(color = Relevance, linetype = Category), linewidth=1) +
    facet_grid(SexualOrientation~Sex, scales="free_y")  +
    scale_color_manual(values = c("Relevant" = "red", "Non-erotic" = "blue", "Irrelevant"="darkorange")) +
    scale_y_continuous(expand = c(0, 0)) +
    scale_x_continuous(expand = c(0, 0)) +
    theme_minimal()  +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank(),
          axis.text.y = element_blank(),
          plot.title = element_text(face="bold")) +
    labs(title = target) 
}

(show_distribution(dftask, "Arousal") | show_distribution(dftask, "Valence")) /
  (show_distribution(dftask, "Enticement") | show_distribution(dftask, "Realness")) +
  patchwork::plot_layout(guides = "collect") +
  patchwork::plot_annotation(title = "Distribution of Appraisals depending on the Sexual Profile",
                             theme = theme(plot.title = element_text(hjust = 0.5, face="bold"))) 

We kept only heterosexual participants (79.92%).

df <- filter(df, SexualOrientation == "Heterosexual")
dftask <- filter(dftask, Participant %in% df$Participant)

Final Sample

Code
df <- filter(df, !Participant %in% outliers)
dftask <- filter(dftask, Participant %in% df$Participant)

The final sample includes 203 participants (Mean age = 37.9, SD = 13.4, range: [18, 80]; Sex: 15.8% females, 84.2% males, 0.0% other; Education: Bachelor, 34.98%; Doctorate, 8.87%; High School, 19.21%; Master, 34.98%; Other, 1.48%; Primary School, 0.49%; Country: 27.09% USA, 16.26% France, 11.82% UK, 44.83% other).

Code
p_country <- dplyr::select(df, region = Country) |>
  group_by(region) |>
  summarize(n = n()) |>
  right_join(map_data("world"), by = "region") |>
  ggplot(aes(long, lat, group = group)) +
  geom_polygon(aes(fill = n)) +
  scale_fill_gradientn(colors = c("#FFEB3B", "red", "purple")) +
  labs(fill = "N") +
  theme_void() +
  labs(title = "A Geographically Diverse Sample", subtitle = "Number of participants by country")  +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2))
  )
p_country

Code
ggwaffle::waffle_iron(df, ggwaffle::aes_d(group = Ethnicity), rows=10) |> 
  ggplot(aes(x, y, fill = group)) + 
  ggwaffle::geom_waffle() + 
  coord_equal() + 
  scale_fill_flat_d() + 
  ggwaffle::theme_waffle() +
  labs(title = "Self-reported Ethnicity", subtitle = "Each square represents a participant", fill="")  +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2)),
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )
Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.

Code
p_age <- estimate_density(df$Age) |>
  normalize(select = y) |> 
  mutate(y = y * 86) |>  # To match the binwidth
  ggplot(aes(x = x)) +
  geom_histogram(data=df, aes(x = Age), fill = "#616161", bins=28) +
  # geom_line(aes(y = y), color = "orange", linewidth=2) +
  geom_vline(xintercept = mean(df$Age), color = "red", linewidth=1.5) +
  # geom_label(data = data.frame(x = mean(df$Age) * 1.15, y = 0.95 * 75), aes(y = y), color = "red", label = paste0("Mean = ", format_value(mean(df$Age)))) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(title = "Age", y = "Number of Participants", color = NULL, subtitle = "Distribution of participants' age") +
  theme_modern(axis.title.space = 10) +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2), vjust = 7),
    axis.text.y = element_text(size = rel(1.1)),
    axis.text.x = element_text(size = rel(1.1)),
    axis.title.x = element_blank()
  )
p_age

Code
p_edu <- df |>
  mutate(Education = fct_relevel(Education, "Other", "Primary School", "High School", "Bachelor", "Master", "Doctorate")) |> 
  ggplot(aes(x = Education)) +
  geom_bar(aes(fill = Education)) +
  scale_y_continuous(expand = c(0, 0), breaks= scales::pretty_breaks()) +
  scale_fill_viridis_d(guide = "none") +
  labs(title = "Education", y = "Number of Participants", subtitle = "Participants per achieved education level") +
  theme_modern(axis.title.space = 15) +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2), vjust = 7),
    axis.text.y = element_text(size = rel(1.1)),
    axis.text.x = element_text(size = rel(1.1)),
    axis.title.x = element_blank()
  )
p_edu

Birth Control

Code
colors <- c(
  "NA" = "#2196F3", "None" = "#E91E63", "Condoms (for partner)" = "#9C27B0",
  "Combined pills" = "#FF9800", "Intrauterine Device (IUD)" = "#FF5722", 
  "Intrauterine System (IUS)" = "#795548", "Progestogen pills" = "#4CAF50",
  "Other" = "#FFC107", "Condoms (female)" = "#607D8B"
)
colors <- colors[names(colors) %in% c("NA", df$BirthControl)]

p_sex <- df |>
  mutate(BirthControl = ifelse(Sex == "Male", "NA", BirthControl),
         BirthControl = fct_relevel(BirthControl, names(colors))) |>
  ggplot(aes(x = Sex)) +
  geom_bar(aes(fill = BirthControl)) +
  scale_y_continuous(expand = c(0, 0), breaks = scales::pretty_breaks()) +
  scale_fill_manual(
    values = colors,
    breaks = names(colors)[2:length(colors)]
  ) +
  labs(x = "Biological Sex", y = "Number of Participants", title = "Sex and Birth Control Method", fill = "Birth Control") +
  theme_modern(axis.title.space = 15) +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2), vjust = 7),
    axis.text.y = element_text(size = rel(1.1)),
    axis.text.x = element_text(size = rel(1.1)),
    axis.title.x = element_blank()
  )
p_sex

Sexual Profile

Code
p_sexprofile <- df |>
  select(Participant, Sex, SexualOrientation, SexualActivity, COPS_Duration_1, COPS_Frequency_2) |> 
  pivot_longer(-all_of(c("Participant", "Sex"))) |> 
  mutate(name = str_replace_all(name, "SexualOrientation", "Sexual Orientation"),
         name = str_replace_all(name, "SexualActivity", "Sexual Activity"),
         name = str_replace_all(name, "COPS_Duration_1", "Pornography Usage (Duration)"),
         name = str_replace_all(name, "COPS_Frequency_2", "Pornography Usage (Frequency)")) |> 
  ggplot(aes(x = value, fill=Sex)) +
  geom_bar() +
  scale_y_continuous(expand = c(0, 0), breaks= scales::pretty_breaks()) +
  scale_fill_manual(values = c("Male"= "#64B5F6", "Female"= "#F06292")) +
  labs(title = "Sexual Profile of Participants") +
  theme_modern(axis.title.space = 15) +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2), vjust = 7),
    axis.text.y = element_text(size = rel(1.1)),
    axis.text.x = element_text(size = rel(1.1), angle = 45, hjust = 1),
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  ) +
  facet_wrap(~name, scales = "free")
p_sexprofile

Code
p_language <- df |>
  ggplot(aes(x = Language_Level)) +
  geom_bar() +
  scale_y_continuous(expand = c(0, 0), breaks= scales::pretty_breaks()) +
  labs(x = "Level", y = "Number of Participants", title = "Language Level") +
  theme_modern(axis.title.space = 15) +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2), vjust = 7),
    axis.text.y = element_text(size = rel(1.1)),
    axis.text.x = element_text(size = rel(1.1))
  )

p_expertise <- df |>
  ggplot(aes(x = AI_Knowledge)) +
  geom_bar() +
  scale_y_continuous(expand = c(0, 0), breaks= scales::pretty_breaks()) +
  labs(x = "Level", y = "Number of Participants", title = "AI-Expertise") +
  theme_modern(axis.title.space = 15) +
  theme(
    plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0),
    plot.subtitle = element_text(size = rel(1.2), vjust = 7),
    axis.text.y = element_text(size = rel(1.1)),
    axis.text.x = element_text(size = rel(1.1))
  )

p_language | p_expertise

Code
p_country /
  (p_age + p_edu)

Beliefs about Artificial Information Technology (BAIT)

This section pertains to the validation of the BAIT scale measuring beliefs and expectations about artificial creations.

Exploratory Factor Analysis

Code
bait <- df |> 
  select(starts_with("BAIT_"), -BAIT_Duration) 


cor <- correlation::correlation(bait, redundant = TRUE) |> 
  correlation::cor_sort() |> 
  correlation::cor_lower()

clean_labels <- function(x) {
  x <- str_remove_all(x, "BAIT_") |> 
    str_replace_all("_", " - ")
}

cor |> 
  mutate(val = paste0(insight::format_value(r), format_p(p, stars_only=TRUE))) |>
  mutate(Parameter2 = fct_rev(Parameter2)) |>
  mutate(Parameter1 = fct_relabel(Parameter1, clean_labels),
         Parameter2 = fct_relabel(Parameter2, clean_labels)) |> 
  ggplot(aes(x=Parameter1, y=Parameter2)) +
  geom_tile(aes(fill = r), color = "white") +
  geom_text(aes(label = val), size = 3) +
  labs(title = "Correlation Matrix",
       subtitle = "Beliefs about Artificial Information Technology (BAIT)") +
  scale_fill_gradient2(
    low = "#2196F3",
    mid = "white",
    high = "#F44336",
    breaks = c(-1, 0, 1),
    guide = guide_colourbar(ticks=FALSE),
    midpoint = 0,
    na.value = "grey85",
    limit = c(-1, 1))  + 
  theme_minimal() +
  theme(legend.title = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1))

Code
n <- parameters::n_factors(bait, package = "nFactors")
plot(n)

Code
efa <- parameters::factor_analysis(bait, cor=cor(bait), n=2, rotation = "oblimin", sort=TRUE, scores="tenBerge", fm="ml")
plot(efa)

Code
display(efa)
Rotated loadings from Factor Analysis (oblimin-rotation)
Variable ML2 ML1 Complexity Uniqueness
BAIT_7_TextRealistic 0.82 -0.06 1.01 0.35
BAIT_8_TextIssues -0.67 0.03 1.00 0.55
BAIT_5_ImitatingReality 0.46 0.09 1.08 0.76
BAIT_1_ImagesRealistic 0.42 0.21 1.48 0.74
BAIT_6_EnvironmentReal 0.40 0.31 1.89 0.69
BAIT_2_ImagesIssues -0.31 -0.02 1.01 0.90
BAIT_4_VideosIssues -0.03 0.88 1.00 0.23
BAIT_3_VideosRealistic -0.16 -0.41 1.32 0.78

The 2 latent factors (oblimin rotation) accounted for 37.34% of the total variance of the original data (ML2 = 23.05%, ML1 = 14.28%).

Confirmatory Factor Analysis

Code
m1 <- lavaan::cfa(efa_to_cfa(efa, threshold="max"), data=df)
m2 <- lavaan::cfa(
  "G =~ BAIT_5_ImitatingReality + BAIT_6_EnvironmentReal + BAIT_4_VideosIssues + BAIT_8_TextIssues + BAIT_3_VideosRealistic + BAIT_1_ImagesRealistic + BAIT_2_ImagesIssues + BAIT_7_TextRealistic", 
  data=df)

# bayestestR::bayesfactor_models(m1, m2)
lavaan::anova(m1, m2)

Chi-Squared Difference Test

   Df     AIC      BIC   Chisq Chisq diff   RMSEA Df diff Pr(>Chisq)    
m1 19 -83.878 -27.5534  76.688                                          
m2 20 -60.656  -7.6446 101.910     25.222 0.34543       1  5.109e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
display(parameters::parameters(m1))
# Loading
Link Coefficient SE 95% CI z p
ML2 =~ BAIT_1_ImagesRealistic 1.00 0.00 (1.00, 1.00) < .001
ML2 =~ BAIT_7_TextRealistic 2.17 0.36 (1.47, 2.87) 6.05 < .001
ML2 =~ BAIT_2_ImagesIssues -0.88 0.23 (-1.33, -0.43) -3.80 < .001
ML2 =~ BAIT_8_TextIssues -1.80 0.31 (-2.42, -1.19) -5.75 < .001
ML2 =~ BAIT_5_ImitatingReality 1.58 0.31 (0.97, 2.18) 5.13 < .001
ML2 =~ BAIT_6_EnvironmentReal 1.49 0.29 (0.92, 2.07) 5.10 < .001
ML1 =~ BAIT_3_VideosRealistic 1.00 0.00 (1.00, 1.00) < .001
ML1 =~ BAIT_4_VideosIssues -0.79 0.29 (-1.36, -0.22) -2.74 0.006
# Correlation
Link Coefficient SE 95% CI z p
ML2 ~~ ML1 -6.89e-03 2.25e-03 (-0.01, -2.48e-03) -3.06 0.002
# Merge with data
scores <- lavaan::predict(m1) |> 
  as.data.frame() |> 
  data_rename(c("ML2"), c("BAIT_SEM")) |> 
  mutate(Participant = df$Participant) |>
  mutate(BAIT = rowMeans(select(bait, -contains("Videos")), na.rm = TRUE))

df <- full_join(df, scores[c("Participant", "BAIT_SEM", "BAIT")], by="Participant")

We computed two type of general scores for the BAIT scale, an empirical score based on the average of observed data (of the most loading items) and a model-based score as predicted by the structural model. The first one gives equal weight to all items (and keeps the same [0-1] range), while the second one is based on the factor loadings and the covariance structure.

Code
correlation::cor_test(scores, "BAIT_SEM", "BAIT") |> 
  plot() +
  ggside::geom_xsidedensity(aes(x=BAIT_SEM), color="grey", linewidth=1) +
  ggside::geom_ysidedensity(aes(y=BAIT), color="grey", linewidth=1) +
  ggside::scale_xsidey_continuous(expand = c(0, 0)) +
  ggside::scale_ysidex_continuous(expand = c(0, 0)) +
  ggside::theme_ggside_void() +
  theme(ggside.panel.scale = .1) 

While the two correlate substantially, they have different benefits. The empirical score has a more straightforward meaning and is more reproducible (as it is not based on a model fitted on a specific sample), the model-based score takes into account the relative importance of the contribution of each item to their factor.

Corrrelation with GAAIS

Code
table <- correlation::correlation(scores, 
                         select(df, starts_with("GAAIS")),
                         bayesian=TRUE) |> 
  summary()

format(table) |> 
  datawizard::data_rename("Parameter", "Variables") |> 
  gt::gt() |> 
  gt::cols_align(align="center") |> 
  gt::tab_options(column_labels.font.weight="bold")
Variables GAAIS_Negative_10 GAAIS_Positive_17 GAAIS_Positive_7 GAAIS_Negative_15 GAAIS_Negative_9 GAAIS_Positive_12
BAIT_SEM -0.15* 0.25*** 0.30*** -0.11 0.14* 0.23***
ML1 -0.04 -0.04 0.05 -0.16** -0.11 0.06
BAIT 3.25e-03 0.25*** 0.14* 0.02 0.15* 0.12

AI-Expertise

Code
df |> 
  ggplot(aes(x=as.factor(AI_Knowledge), y=BAIT)) +
  geom_boxplot()

Code
# m <- betareg::betareg(BAIT ~ AI_Knowledge, data=df)
m <- lm(BAIT_SEM ~ AI_Knowledge, data=df)
# m <- brms::brm(BAIT ~ mo(AI_Knowledge), data=dfsub, algorithm = "meanfield")
# m <- brms::brm(BAIT ~ AI_Knowledge, data=dfsub, algorithm = "meanfield")
display(parameters::parameters(m))
Parameter Coefficient SE 95% CI t(201) p
(Intercept) -0.01 0.02 (-0.05, 0.02) -0.79 0.432
AI Knowledge 4.00e-03 4.84e-03 (-5.54e-03, 0.01) 0.83 0.409
Code
marginaleffects::predictions(m, by=c("AI_Knowledge"), newdata = "marginalmeans") |> 
  as.data.frame() |> 
  ggplot(aes(x=AI_Knowledge, y=estimate)) +
  geom_jitter2(data=df, aes(y=BAIT_SEM), alpha=0.2, width=0.1) +
  geom_line(aes(group=1), position = position_dodge(width=0.2)) +
  geom_pointrange(aes(ymin = conf.low, ymax=conf.high), position = position_dodge(width=0.2)) +
  theme_minimal() +
  labs(x = "AI-Knowledge", y="BAIT Score")

Gender and Age

Code
# m <- betareg::betareg(BAIT ~ Sex / Age, data=df, na.action=na.omit)
m <- lm(BAIT_SEM ~ Sex / Age, data=df)
display(parameters::parameters(m))
Parameter Coefficient SE 95% CI t(199) p
(Intercept) -0.02 0.04 (-0.09, 0.05) -0.46 0.646
Sex (Male) 0.02 0.04 (-0.07, 0.10) 0.38 0.702
Sex (Female) × Age 1.04e-03 1.04e-03 (-1.02e-03, 3.10e-03) 1.00 0.320
Sex (Male) × Age -5.69e-05 4.72e-04 (-9.87e-04, 8.74e-04) -0.12 0.904

Belief in the Instructions

Code
glm(Feedback_LabelsIncorrect ~ BAIT_SEM * AI_Knowledge, 
    data=mutate(df, Feedback_LabelsIncorrect = ifelse(Feedback_LabelsIncorrect=="True", 1, 0)), 
    family="binomial") |> 
  parameters::parameters() |> 
  display(title="Predicting 'Labels are Incorrect'")
Predicting ‘Labels are Incorrect’
Parameter Log-Odds SE 95% CI z p
(Intercept) -0.63 0.48 (-1.59, 0.29) -1.32 0.186
BAIT SEM 7.36 6.86 (-5.96, 21.17) 1.07 0.283
AI Knowledge 0.13 0.13 (-0.11, 0.38) 1.04 0.300
BAIT SEM × AI Knowledge -2.81 1.75 (-6.37, 0.55) -1.61 0.108
Code
glm(Feedback_LabelsReversed ~ BAIT_SEM * AI_Knowledge, 
    data=mutate(df, Feedback_LabelsReversed = ifelse(Feedback_LabelsReversed=="True", 1, 0)), 
    family="binomial") |> 
  parameters::parameters() |> 
  display(title="Predicting 'Labels are reversed'")
Predicting ‘Labels are reversed’
Parameter Log-Odds SE 95% CI z p
(Intercept) -2.43 0.90 (-4.37, -0.81) -2.70 0.007
BAIT SEM -4.79 13.04 (-30.39, 20.29) -0.37 0.714
AI Knowledge -0.04 0.24 (-0.50, 0.44) -0.17 0.867
BAIT SEM × AI Knowledge 0.18 3.34 (-6.41, 6.49) 0.05 0.957
Code
glm(Feedback_CouldDiscriminate ~ BAIT_SEM * AI_Knowledge, 
    data=mutate(df, Feedback_CouldDiscriminate = ifelse(Feedback_CouldDiscriminate=="True", 1, 0)), 
    family="binomial") |> 
  parameters::parameters() |> 
  display(title="Predicting 'Easy to discriminate'")
Predicting ‘Easy to discriminate’
Parameter Log-Odds SE 95% CI z p
(Intercept) -3.06 1.25 (-5.89, -0.91) -2.45 0.014
BAIT SEM 3.36 18.33 (-33.21, 37.27) 0.18 0.855
AI Knowledge -0.09 0.34 (-0.74, 0.60) -0.27 0.788
BAIT SEM × AI Knowledge -2.02 4.83 (-11.14, 7.11) -0.42 0.676
Code
glm(Feedback_CouldNotDiscriminate ~ BAIT_SEM * AI_Knowledge, 
    data=mutate(df, Feedback_CouldNotDiscriminate = ifelse(Feedback_CouldNotDiscriminate=="True", 1, 0)), 
    family="binomial") |> 
  parameters::parameters() |> 
  display(title="Predicting 'Hard to discriminate'")
Predicting ‘Hard to discriminate’
Parameter Log-Odds SE 95% CI z p
(Intercept) 0.31 0.48 (-0.63, 1.27) 0.65 0.517
BAIT SEM 6.01 7.03 (-7.63, 20.14) 0.85 0.393
AI Knowledge -0.18 0.13 (-0.44, 0.06) -1.44 0.150
BAIT SEM × AI Knowledge 0.31 1.77 (-3.19, 3.81) 0.18 0.861
Code
glm(Feedback_Fun ~ BAIT_SEM * AI_Knowledge, 
    data=mutate(df, Feedback_Fun = ifelse(Feedback_Fun=="True", 1, 0)), 
    family="binomial") |> 
  parameters::parameters() |> 
  display(title="Predicting 'Fun'")
Predicting ‘Fun’
Parameter Log-Odds SE 95% CI z p
(Intercept) 1.01 0.50 (0.06, 2.02) 2.04 0.042
BAIT SEM 5.12 7.07 (-8.67, 19.25) 0.72 0.469
AI Knowledge -0.12 0.13 (-0.38, 0.13) -0.95 0.344
BAIT SEM × AI Knowledge -0.47 1.76 (-3.95, 3.00) -0.27 0.787
Code
glm(Feedback_Boring ~ BAIT_SEM * AI_Knowledge, 
    data=mutate(df, Feedback_Boring = ifelse(Feedback_Boring=="True", 1, 0)), 
    family="binomial") |> 
  parameters::parameters() |> 
  display(title="Predicting 'Boring'")
Predicting ‘Boring’
Parameter Log-Odds SE 95% CI z p
(Intercept) -2.22 0.61 (-3.49, -1.07) -3.61 < .001
BAIT SEM -0.31 8.53 (-17.13, 16.45) -0.04 0.971
AI Knowledge 0.25 0.15 (-0.04, 0.56) 1.65 0.099
BAIT SEM × AI Knowledge -0.63 2.09 (-4.79, 3.43) -0.30 0.764

Save

write.csv(df, "../data/data_participants.csv", row.names = FALSE)
write.csv(dftask, "../data/data.csv", row.names = FALSE)